# 随机森林包
library(randomForest)
# 数据整理和绘图
library(tidyverse)
# 查看数据整体
library(skimr)
# 数据缺失情况 缺失值填充
library(DataExplorer)
# 数据拆分 模型评估 主要是做机器学习最常用包
library(caret)
library(pROC)

# Tutorial source: https://www.bilibili.com/video/BV1Ag4y157rz
# Learning notes taken while following the video (2023-05-21).
data1 <- iris

# First look at the data set as a whole.
skim(data1)
str(data1)
hist(data1$Sepal.Width, breaks = 5)
plot_missing(data1)
colnames(data1)

# Split the data: 70% training, 30% validation.
# Fix the seed so the partition is reproducible.
set.seed(40)
trains <- createDataPartition(y = data1$Species, p = 0.7, list = FALSE)
trainsdata <- data1[trains, ]
testdata <- data1[-trains, ]

# Compare the Sepal.Width distribution across the two subsets.
hist(trainsdata$Sepal.Width, breaks = 10)
hist(testdata$Sepal.Width, breaks = 10)

# Model formula: predict Species from the four measurements.
formula <- as.formula(
  Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
)

# Fit a random forest on the training split.
# Uses the `formula` object built above (it was previously defined but unused).
set.seed(40)
trainstreeResult <- randomForest(
  formula,
  data = trainsdata,
  ntree = 500,
  mtry = 4,           # number of predictors tried per split (all 4 here)
  importance = TRUE   # TRUE, not T: T can be reassigned
)

# Predict the hold-out split with the model trained above.
set.seed(40)
predicttest <- predict(trainstreeResult, newdata = testdata)

# A second forest fitted on the test split (kept for the tutorial's
# comparison; normally the test set is only used for evaluation).
testdatatreeResult <- randomForest(
  formula,
  data = testdata,
  ntree = 500,
  mtry = 4,
  importance = TRUE
)

# Training-split model: OOB summary, error plot, variable importance.
trainstreeResult
plot(trainstreeResult,main = "EASDASDADS")
importance(trainstreeResult)
varImpPlot(trainstreeResult,main = "variable Importance plot",type = 1)
varImpPlot(trainstreeResult,main = "variable Importance plot",type = 2)

# Test-split model.
testdatatreeResult
plot(testdatatreeResult,main = "EASDASDADS")
importance(testdatatreeResult)
# BUG FIX: varImpPlot() needs a randomForest object; `predicttest` is a
# factor of predictions, so the original calls failed at run time.
varImpPlot(testdatatreeResult,main = "variable Importance plot",type = 1)
varImpPlot(testdatatreeResult,main = "variable Importance plot",type = 2)

############################################
# Worked examples copied from ?randomForest: classification on iris.
set.seed(71)
iris.rf <- randomForest(Species ~ ., data=iris, importance=TRUE,
                        proximity=TRUE)
print(iris.rf)
## Look at variable importance:
round(importance(iris.rf), 2)
## Do MDS on 1 - proximity: project the proximity matrix into 2D coordinates.
iris.mds <- cmdscale(1 - iris.rf$proximity, eig=TRUE)
# Square plotting region; previous par() settings restored below.
op <- par(pty="s")
pairs(cbind(iris[,1:4], iris.mds$points), cex=0.6, gap=0,
      col=c("red", "green", "blue")[as.numeric(iris$Species)],
      main="Iris Data: Predictors and MDS of Proximity Based on RandomForest")
par(op)
print(iris.mds$GOF)

## The `unsupervised' case: no response supplied, so the forest is grown
## against a synthetic sample and proximities are visualized with MDSplot().
set.seed(17)
iris.urf <- randomForest(iris[, -5])
MDSplot(iris.urf, iris$Species)

## stratified sampling: draw 20, 30, and 20 of the species to grow each tree.
(iris.rf2 <- randomForest(iris[1:4], iris$Species, 
                          sampsize=c(20, 30, 20)))

## Regression: predict Ozone from the other airquality variables.
## data(airquality)
set.seed(131)
ozone.rf <- randomForest(Ozone ~ ., data=airquality, mtry=3,
                         importance=TRUE, na.action=na.omit)
print(ozone.rf)
## Show "importance" of variables: higher value means more important:
round(importance(ozone.rf), 2)

## "x" can be a matrix instead of a data frame:
set.seed(17)
x <- matrix(runif(5e2), 100)
y <- gl(2, 50)
(myrf <- randomForest(x, y))
(predict(myrf, x))

## "complicated" formula: transformed response, dropped and derived terms.
(swiss.rf <- randomForest(sqrt(Fertility) ~ . - Catholic + I(Catholic < 50),
                          data=swiss))
(predict(swiss.rf, swiss))
## Test use of a 53-level factor as a predictor
## (NOTE(review): the original comment said 32-level, but gl(53, 10)
## creates 53 levels with 530 observations):
set.seed(1)
x <- data.frame(x1=gl(53, 10), x2=runif(530), y=rnorm(530))
(rf1 <- randomForest(x[-3], x[[3]], ntree=10))

## Grow no more than 4 nodes per tree:
(treesize(randomForest(Species ~ ., data=iris, maxnodes=4, ntree=30)))

## test proximity in regression
iris.rrf <- randomForest(iris[-1], iris[[1]], ntree=101, proximity=TRUE, oob.prox=FALSE)
str(iris.rrf$proximity)

## Using weights: make versicolors having 3 times larger weights
iris_wt <- ifelse( iris$Species == "versicolor", 3, 1 )
set.seed(15)
iris.wcrf <- randomForest(iris[-5], iris[[5]], weights=iris_wt, keep.inbag=TRUE)
print(rowSums(iris.wcrf$inbag))
set.seed(15)
iris.wrrf <- randomForest(iris[-1], iris[[1]], weights=iris_wt, keep.inbag=TRUE)
print(rowSums(iris.wrrf$inbag))
# } (end of the copied help-page examples)


# End of the tutorial section; personal study notes follow.

# Read the OTU abundance table (rows = OTUs, columns = samples).
# NOTE(review): setwd() in a script is fragile -- prefer running from the
# project directory or using relative paths.
setwd('F:/Rproject/r-language/paper1Somke/200205-R包randomForest的随机森林分类模型以及对重要变量的选择')
otu <- read.table('otu_table.txt', sep = '\t', row.names = 1, header = TRUE, fill = TRUE)

# Drop low-abundance OTUs: they contribute little to classification and
# slow the computation. Threshold: row sum of at least one count per
# sample (generalized from the hard-coded 120 = number of samples, so the
# script also works with a different sample count).
otu <- otu[which(rowSums(otu) >= ncol(otu)), ]

# Attach the group labels so the table has the shape randomForest expects
# (rows = samples, columns = OTUs + group).
group <- read.table('group.txt', sep = '\t', row.names = 1, header = TRUE, fill = TRUE)
otu <- data.frame(t(otu))
otu_group <- cbind(otu, group)

# Demo: set.seed() makes random draws reproducible -- the same seed always
# yields the same numbers; different seeds (6 vs 8) yield different ones.
set.seed(6)
x <- rnorm(3)
set.seed(8)
y <- rnorm(3)
x == y
x
y

# Split the full data set into training (70%) and test (30%) sets.
set.seed(123)
n_samples <- nrow(otu_group)
select_train <- sample(n_samples, n_samples * 0.7)
otu_train <- otu_group[select_train, ]
otu_test <- otu_group[-select_train, ]

####################################################
# Random forest with the randomForest package.
# Advantages of random forests over other classifiers:
# - usually higher classification accuracy;
# - handles high-dimensional (many-variable) data without dimension reduction;
# - scales well to large data sets;
# - can be applied to data with many missing values;
# - measures each variable's relative importance while classifying.
# Reference: https://www.jianshu.com/p/8fbe90ee68b7
library(randomForest)  # duplicate library() call removed (it is idempotent)

# The package ships the iris example data set.
data(iris)

# Worked example from ?randomForest (repeats the earlier tutorial section).
set.seed(71)
iris.rf <- randomForest(Species ~ ., data=iris, importance=TRUE,
                        proximity=TRUE)
print(iris.rf)
iris

round(importance(iris.rf), 2)
iris.mds <- cmdscale(1 - iris.rf$proximity, eig=TRUE)
op <- par(pty="s")
pairs(cbind(iris[,1:4], iris.mds$points), cex=0.6, gap=0,
      col=c("red", "green", "blue")[as.numeric(iris$Species)],
      main="Iris Data: Predictors and MDS of Proximity Based on RandomForest")
par(op)
print(iris.mds$GOF)

# Fit the classifier on the OTU training set (default: 500 trees).
set.seed(123)
otu_train.forest <- randomForest(groups ~ ., data = otu_train, importance = TRUE,proximity=TRUE)
otu_train.forest

# Margin plot: per-observation probability of being classified correctly
# (positive margin = the correct class won the vote).
plot(margin(otu_train.forest, otu_train$groups), main = '观测值被判断正确的概率图')

# Evaluate on the training set itself (optimistic; expect near-perfect).
train_predict <- predict(otu_train.forest, otu_train)
compare_train <- table(train_predict, otu_train$groups)
compare_train
# Overall training accuracy: diagonal of the confusion table over its total.
sum(diag(compare_train)/sum(compare_train))

# Evaluate on the held-out test set.
test_predict <- predict(otu_train.forest, otu_test)
compare_test <- table(otu_test$groups, test_predict, dnn = c('Actual', 'Predicted'))
compare_test

### Identify key OTUs.
# Importance scores per variable (OTU), as stored on the fitted object.
#summary(otu_train.forest)
importance_otu <- otu_train.forest$importance
head(importance_otu)

# Or via importance(), which scales the accuracy measures by default.
importance_otu <- data.frame(importance(otu_train.forest))
head(importance_otu)

# Rank OTUs by an importance measure, e.g. "Mean Decrease Accuracy".
importance_otu <- importance_otu[order(importance_otu$MeanDecreaseAccuracy, decreasing = TRUE), ]
head(importance_otu)

# Export the table if needed.
#write.table(importance_otu, 'importance_otu.txt', sep = '\t', col.names = NA, quote = FALSE)

# Plot the top 30 most important OTUs.
varImpPlot(otu_train.forest, n.var = min(30, nrow(otu_train.forest$importance)), main = 'Top 30 - variable importance')

### Cross-validation to help choose how many OTUs are needed.
# Ten-fold cross-validation, repeated 5 times.
# BUG FIX: the response column is `groups`; the original `otu_train$group`
# only resolved through R's silent partial matching of `$`.
# NOTE(review): `otu_train[-ncol(otu_train)]` assumes the label column is
# last -- it is, because cbind(otu, group) appended it.
set.seed(123)
otu_train.cv <- replicate(5, rfcv(otu_train[-ncol(otu_train)], otu_train$groups, cv.fold = 10, step = 1.5), simplify = FALSE)
otu_train.cv

# Extract the cross-validation error rates and reshape them for plotting.
otu_train.cv <- data.frame(sapply(otu_train.cv, '[[', 'error.cv'))
otu_train.cv$otus <- rownames(otu_train.cv)
otu_train.cv <- reshape2::melt(otu_train.cv, id = 'otus')
otu_train.cv$otus <- as.numeric(as.character(otu_train.cv$otus))

# Smoothed curve of cross-validation error vs number of OTUs.
library(ggplot2)
library(splines)  # ns() supplies the spline basis used in geom_smooth();
                  # use geom_line() instead of geom_smooth() for a plain line

p <- ggplot(otu_train.cv, aes(otus, value)) +
    geom_smooth(se = FALSE, method = 'glm', formula = y ~ ns(x, 6)) +
    theme(panel.grid = element_blank(), panel.background = element_rect(color = 'black', fill = 'transparent')) +
    labs(title = '', x = 'Number of OTUs', y = 'Cross-validation error')

p

# The error roughly levels off around 30 OTUs; mark that cutoff.
p + geom_vline(xintercept = 30)

# Rank again by "Mean Decrease Accuracy" (same ordering as above).
importance_otu <- importance_otu[order(importance_otu$MeanDecreaseAccuracy, decreasing = TRUE), ]
head(importance_otu)

# Export the top-30 table if needed.
#write.table(importance_otu[1:30, ], 'importance_otu_top30.txt', sep = '\t', col.names = NA, quote = FALSE)

### Reduced classifier.
# Keep the 30 most important OTUs from the ranking above.
otu_select <- rownames(importance_otu)[1:30]

# Training and test subsets restricted to the selected OTUs + labels.
otu_train_top30 <- otu_train[ ,c(otu_select, 'groups')]
otu_test_top30 <- otu_test[ ,c(otu_select, 'groups')]

# Random forest on the reduced feature set (default 500 trees), ?randomForest.
set.seed(123)
otu_train.forest_30 <- randomForest(groups ~ ., data = otu_train_top30, importance = TRUE)
otu_train.forest_30

# BUG FIX: margin() reports per-observation vote margins for the data the
# forest was trained on, so the observed labels must come from
# otu_train_top30 -- the original passed the (differently sized) test-set
# labels, mislabeling the plot.
plot(margin(otu_train.forest_30, otu_train_top30$groups), main = '观测值被判断正确的概率图')

# Evaluate on the training subset.
train_predict <- predict(otu_train.forest_30, otu_train_top30)
compare_train <- table(train_predict, otu_train_top30$groups)
compare_train

# Evaluate on the test subset.
test_predict <- predict(otu_train.forest_30, otu_test_top30)
compare_test <- table(otu_test_top30$groups, test_predict, dnn = c('Actual', 'Predicted'))
compare_test

## Show the predicted classification on an NMDS ordination plot.
# NMDS ordination of the samples on Bray-Curtis distances.
nmds <- vegan::metaMDS(otu, distance = 'bray') 
result <- nmds$points
# cbind() with the rownames coerces everything to character, hence the
# as.numeric(as.character()) conversions below.
result <- as.data.frame(cbind(result, rownames(result)))

# Combine train/test predictions and align them with the NMDS rows by
# sample name. ROBUSTNESS FIX: convert the factors to character before
# concatenating -- on R < 4.1.0, c() on factors returned the bare integer
# codes, silently corrupting the group labels. Names (sample IDs) are
# re-attached explicitly because as.character() drops them.
predict_group <- c(as.character(train_predict), as.character(test_predict))
names(predict_group) <- c(names(train_predict), names(test_predict))
predict_group <- predict_group[rownames(result)]

# Assemble the plotting data frame.
colnames(result)[1:3] <- c('NMDS1', 'NMDS2', 'samples')
result$NMDS1 <- as.numeric(as.character(result$NMDS1)) 
result$NMDS2 <- as.numeric(as.character(result$NMDS2))
result$samples <- as.character(result$samples)
result <- cbind(result, predict_group)
head(result)

# Scatter plot with a convex hull drawn around each predicted group.
ggplot(result, aes(NMDS1, NMDS2, color = predict_group)) +  
    geom_polygon(data = plyr::ddply(result, 'predict_group', function(df) df[chull(df[[1]], df[[2]]), ]), fill = NA) +
    geom_point()

